This R Markdown document reads, combines, and summarizes multiple `_summary.csv` files.

0. Packages

This section loads the packages used for import, manipulation, and plotting.

# Load the tidyverse meta-package (readr, dplyr, tidyr, stringr, purrr,
# ggplot2) used throughout for CSV import, wrangling, and plotting.
library(tidyverse)

# Make plots larger and labels more legible throughout the report.
# These knitr chunk options apply to every subsequent chunk in the document.
knitr::opts_chunk$set(
  fig.width = 16,        # wide panels for the faceted QC plots below
  fig.height = 8,
  dpi = 200,             # higher-resolution raster output
  out.width = "100%",    # scale figures to the full page width
  fig.align = "center"
)

# Ensure rmarkdown can find Pandoc.
# On Windows, Quarto bundles pandoc in: <quarto>/bin/tools/pandoc.exe.
if (!nzchar(Sys.which("pandoc"))) {
  quarto_path <- Sys.which("quarto")
  # Candidate directory holding Quarto's bundled pandoc ("" when quarto is absent).
  tools_dir <- if (nzchar(quarto_path)) {
    normalizePath(
      file.path(dirname(quarto_path), "tools"),
      winslash = "/",
      mustWork = FALSE
    )
  } else {
    ""
  }
  # Point rmarkdown at the bundled pandoc only when the directory really exists.
  if (nzchar(tools_dir) && dir.exists(tools_dir)) {
    Sys.setenv(RSTUDIO_PANDOC = tools_dir)
  }
}

1. Read and combine ALL CSVs

Reads all `_summary.csv` files from each folder and combines them into a single master table, `dat_raw`.

# Input locations: each folder is scanned for summary CSV exports.
folders <- c(
  "C:\\Users\\dunnmk\\University of Michigan Dropbox\\MED-WILSONTELAB\\wilsontelab box\\Common\\Projects\\Yeast Aim 3\\3C Sequencing Data\\3C_summary",
  "C:\\Users\\dunnmk\\University of Michigan Dropbox\\MED-WILSONTELAB\\wilsontelab box\\Common\\Projects\\Yeast Aim 3\\Sequencing\\PD Data"
)

# Collect every *_summary.csv / *_collapsed_summary.csv path, folder by folder,
# preserving the folder order given above.
files <- folders |>
  lapply(function(dir) {
    list.files(
      dir,
      pattern = "(_summary|_collapsed_summary)\\.csv$",
      full.names = TRUE
    )
  }) |>
  unlist()

# Read each summary CSV and normalize its schema before combining.
# Older exports lack a `repeat` column; add it with the sentinel "ALL" so every
# file shares one schema for bind_rows(). `show_col_types = FALSE` suppresses
# readr's per-file column-specification banner, which otherwise floods the
# knitted report with one banner per input file.
raw_list <- lapply(files, function(f) {
  dat <- read_csv(f, show_col_types = FALSE)
  if (!("repeat" %in% names(dat))) {
    # `repeat` is a reserved word in R, hence the backticks.
    dat <- dat %>% mutate(`repeat` = "ALL")
  }
  dat %>%
    mutate(
      source_file = basename(f),          # provenance: which CSV each row came from
      `repeat` = as.character(`repeat`)   # force character so files bind cleanly
    )
})
## Rows: 76 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 76 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 74 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): DSB2_loci, alignment_name, cis_trans, DSB, combo, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 153 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 184 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 139 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 183 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): DSB2_loci, alignment_name, cis_trans, DSB, combo, repeat, allele
## dbl (4): batch, time_point, replicate, count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack all per-file tables into one master table, then make `batch` a factor
# (levels in ascending numeric order) so plots sort and facet consistently.
dat_raw <- raw_list %>%
  bind_rows() %>%
  mutate(batch = factor(batch, levels = sort(unique(batch))))
str(dat_raw)
## tibble [1,107 × 12] (S3: tbl_df/tbl/data.frame)
##  $ batch         : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DSB2_loci     : chr [1:1107] "chr7" "chr7" "chr7" "chr7" ...
##  $ time_point    : num [1:1107] 0 0 0 0 0 0 0 0 0 0 ...
##  $ replicate     : num [1:1107] 1 1 1 1 1 1 1 1 1 1 ...
##  $ alignment_name: chr [1:1107] "CIS_A_to_B_DSB1_Chr12_L01" "CIS_A_to_B_DSB1_Chr12_L05" "CIS_A_to_B_DSB1_Chr12_L07" "CIS_A_to_B_DSB1_Chr12_L12" ...
##  $ cis_trans     : chr [1:1107] "CIS" "CIS" "CIS" "CIS" ...
##  $ DSB           : chr [1:1107] "DSB1" "DSB1" "DSB1" "DSB1" ...
##  $ combo         : chr [1:1107] "A_to_B" "A_to_B" "A_to_B" "A_to_B" ...
##  $ allele        : chr [1:1107] "Chr12_L01" "Chr12_L05" "Chr12_L07" "Chr12_L12" ...
##  $ count         : num [1:1107] 23365 15119 19481 22779 27868 ...
##  $ repeat        : chr [1:1107] "ALL" "ALL" "ALL" "ALL" ...
##  $ source_file   : chr [1:1107] "batch4_T0_3C_summary.csv" "batch4_T0_3C_summary.csv" "batch4_T0_3C_summary.csv" "batch4_T0_3C_summary.csv" ...
knitr::kable(head(dat_raw, 10), caption = "Preview: combined raw `_summary.csv` rows (first 10)")
Preview: combined raw _summary.csv rows (first 10)
batch DSB2_loci time_point replicate alignment_name cis_trans DSB combo allele count repeat source_file
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L01 CIS DSB1 A_to_B Chr12_L01 23365 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L05 CIS DSB1 A_to_B Chr12_L05 15119 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L07 CIS DSB1 A_to_B Chr12_L07 19481 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr12_L12 CIS DSB1 A_to_B Chr12_L12 22779 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L01 CIS DSB1 A_to_B Chr15_L01 27868 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L04 CIS DSB1 A_to_B Chr15_L04 54 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L15 CIS DSB1 A_to_B Chr15_L15 14456 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L17_2 CIS DSB1 A_to_B Chr15_L17_2 28338 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr15_L18 CIS DSB1 A_to_B Chr15_L18 4689 ALL batch4_T0_3C_summary.csv
4 chr7 0 1 CIS_A_to_B_DSB1_Chr3_L02 CIS DSB1 A_to_B Chr3_L02 26669 ALL batch4_T0_3C_summary.csv

2. Aggregate counts and percentages

Purpose: calculate percentages from aggregated counts.

# Aggregate counts within the current grouping of `dat` and derive each
# category's percentage of the group total. Expects columns `count`,
# `cis_trans`, and `combo`; returns one row per group with `.groups` dropped.
summarize_counts_pct <- function(dat){
  counts <- dat %>%
    summarise(
      Total_Counts   = sum(count),
      Cis_Counts     = sum(count[cis_trans == "CIS"]),
      Trans_Counts   = sum(count[cis_trans == "TRANS"]),
      A_to_B_Counts  = sum(count[combo == "A_to_B"]),
      C_to_D_Counts  = sum(count[combo == "C_to_D"]),
      A_to_D_Counts  = sum(count[combo == "A_to_D"]),
      C_to_B_Counts  = sum(count[combo == "C_to_B"]),
      .groups = "drop"
    )

  # Percentages are NA (not 0) when a group has no counts at all.
  pct_of_total <- function(x, total) {
    if_else(total > 0, 100 * x / total, NA_real_)
  }

  counts %>%
    mutate(
      Percent_Cis    = pct_of_total(Cis_Counts, Total_Counts),
      Percent_Trans  = pct_of_total(Trans_Counts, Total_Counts),
      Percent_A_to_B = pct_of_total(A_to_B_Counts, Total_Counts),
      Percent_C_to_D = pct_of_total(C_to_D_Counts, Total_Counts),
      Percent_A_to_D = pct_of_total(A_to_D_Counts, Total_Counts),
      Percent_C_to_B = pct_of_total(C_to_B_Counts, Total_Counts)
    )
}

# Aggregate the master table at batch × time_point × DSB resolution.
dat_agg_counts <- summarize_counts_pct(
  group_by(dat_raw, batch, time_point, DSB)
)

str(dat_agg_counts)
## tibble [24 × 16] (S3: tbl_df/tbl/data.frame)
##  $ batch         : Factor w/ 3 levels "4","6","8": 1 1 1 1 1 1 2 2 2 2 ...
##  $ time_point    : num [1:24] 0 0 0 120 120 120 0 0 0 120 ...
##  $ DSB           : chr [1:24] "DSB1" "DSB2" "TRANS" "DSB1" ...
##  $ Total_Counts  : num [1:24] 499775 458690 3754 538320 483621 ...
##  $ Cis_Counts    : num [1:24] 499775 458690 0 538320 483621 ...
##  $ Trans_Counts  : num [1:24] 0 0 3754 0 0 ...
##  $ A_to_B_Counts : num [1:24] 499775 0 0 538320 0 ...
##  $ C_to_D_Counts : num [1:24] 0 458690 0 0 483621 ...
##  $ A_to_D_Counts : num [1:24] 0 0 3061 0 0 ...
##  $ C_to_B_Counts : num [1:24] 0 0 693 0 0 ...
##  $ Percent_Cis   : num [1:24] 100 100 0 100 100 0 100 100 0 100 ...
##  $ Percent_Trans : num [1:24] 0 0 100 0 0 100 0 0 100 0 ...
##  $ Percent_A_to_B: num [1:24] 100 0 0 100 0 0 100 0 0 100 ...
##  $ Percent_C_to_D: num [1:24] 0 100 0 0 100 0 0 100 0 0 ...
##  $ Percent_A_to_D: num [1:24] 0 0 81.5 0 0 ...
##  $ Percent_C_to_B: num [1:24] 0 0 18.5 0 0 ...
knitr::kable(head(dat_agg_counts, 10), caption = "Aggregated counts + derived percentages by batch × time_point × DSB (first 10)")
Aggregated counts + derived percentages by batch × time_point × DSB (first 10)
batch time_point DSB Total_Counts Cis_Counts Trans_Counts A_to_B_Counts C_to_D_Counts A_to_D_Counts C_to_B_Counts Percent_Cis Percent_Trans Percent_A_to_B Percent_C_to_D Percent_A_to_D Percent_C_to_B
4 0 DSB1 499775 499775 0 499775 0 0 0 100 0 100 0 0.00000 0.00000
4 0 DSB2 458690 458690 0 0 458690 0 0 100 0 0 100 0.00000 0.00000
4 0 TRANS 3754 0 3754 0 0 3061 693 0 100 0 0 81.53969 18.46031
4 120 DSB1 538320 538320 0 538320 0 0 0 100 0 100 0 0.00000 0.00000
4 120 DSB2 483621 483621 0 0 483621 0 0 100 0 0 100 0.00000 0.00000
4 120 TRANS 31673 0 31673 0 0 18323 13350 0 100 0 0 57.85054 42.14946
6 0 DSB1 1461335 1461335 0 1461335 0 0 0 100 0 100 0 0.00000 0.00000
6 0 DSB2 1945011 1945011 0 0 1945011 0 0 100 0 0 100 0.00000 0.00000
6 0 TRANS 18616 0 18616 0 0 15477 3139 0 100 0 0 83.13816 16.86184
6 120 DSB1 169434 169434 0 169434 0 0 0 100 0 100 0 0.00000 0.00000

The table above is now aggregated to batch × time_point × DSB.

2.5 Batch-level QC plots (counts + composition)

Purpose: quick sanity-check plots before allele-level cis/trans normalization.

# Aggregate from the master raw table (same grouping as dat_agg_counts;
# recomputed here so the QC section is self-contained).
agg_qc <- summarize_counts_pct(
  group_by(dat_raw, batch, time_point, DSB)
)

# From here on in QC, compute CIS/TRANS using ONLY the 4 combos of interest:
#   CIS   := A_to_B + C_to_D
#   TRANS := A_to_D + C_to_B
# and define percents relative to the total of these 4 combos.
#
# IMPORTANT: we intentionally DO NOT group by `DSB` here.
# In this dataset, `DSB` can include levels like "TRANS", and grouping by it
# will split CIS and TRANS into different facets and produce misleading 100% bars.
agg_qc_combo4 <- dat_raw %>%
  group_by(batch, time_point) %>%
  summarise(
    Total_All = sum(count),
    A_to_B = sum(count[combo == "A_to_B"]),
    C_to_D = sum(count[combo == "C_to_D"]),
    A_to_D = sum(count[combo == "A_to_D"]),
    C_to_B = sum(count[combo == "C_to_B"]),
    .groups = "drop"
  ) %>%
  # Derived columns: class totals, the 4-combo denominator, and percentages.
  # Percents are NA (not 0) when the relevant denominator is zero.
  mutate(
    Cis_Counts = A_to_B + C_to_D,
    Trans_Counts = A_to_D + C_to_B,
    Total_4Combos = Cis_Counts + Trans_Counts,
    Excluded_Counts = pmax(Total_All - Total_4Combos, 0),  # rows outside the 4 combos; clamped at 0
    Percent_Cis = if_else(Total_4Combos > 0, 100 * Cis_Counts / Total_4Combos, NA_real_),
    Percent_Trans = if_else(Total_4Combos > 0, 100 * Trans_Counts / Total_4Combos, NA_real_),
    Percent_A_to_B_in_Cis = if_else(Cis_Counts > 0, 100 * A_to_B / Cis_Counts, NA_real_),
    Percent_C_to_D_in_Cis = if_else(Cis_Counts > 0, 100 * C_to_D / Cis_Counts, NA_real_),
    Percent_A_to_D_in_Trans = if_else(Trans_Counts > 0, 100 * A_to_D / Trans_Counts, NA_real_),
    Percent_C_to_B_in_Trans = if_else(Trans_Counts > 0, 100 * C_to_B / Trans_Counts, NA_real_)
  )

# Quick diagnostic table: if Excluded_Counts is large, then the 4-combo definition is omitting real counts.
agg_qc_combo4 %>%
  arrange(time_point, batch) %>%
  select(
    batch, time_point,
    Total_All,
    Total_4Combos,
    Excluded_Counts,
    Percent_Cis,
    Percent_Trans
  ) %>%
  slice_head(n = 20) %>%
  knitr::kable(caption = "QC (4-combo definition): totals vs excluded counts, plus CIS%/TRANS% (first 20 rows)")
QC (4-combo definition): totals vs excluded counts, plus CIS%/TRANS% (first 20 rows)
batch time_point Total_All Total_4Combos Excluded_Counts Percent_Cis Percent_Trans
4 0 962219 962219 0 99.60986 0.3901399
6 0 3424962 3424962 0 99.45646 0.5435389
8 0 1946822 1946822 0 99.18847 0.8115277
4 120 1053614 1053614 0 96.99387 3.0061294
6 120 253131 253131 0 95.00496 4.9950421
8 120 195628 195628 0 95.88300 4.1169976
6 180 3539160 3539160 0 93.49416 6.5058375
8 180 3492432 3492432 0 92.71874 7.2812584
# Formatting helpers (avoid extra package dependencies)
# Format numbers with thousands separators (e.g. 1234567 -> "1,234,567");
# NA inputs stay NA rather than becoming the string "NA".
comma_label <- function(x) {
  formatted <- formatC(x, format = "f", digits = 0, big.mark = ",")
  ifelse(is.na(x), NA_character_, formatted)
}

# Format percentages as "12.3%" with configurable rounding; NA inputs stay NA.
pct_label <- function(x, digits = 1) {
  rounded <- round(x, digits)
  ifelse(is.na(x), NA_character_, paste0(rounded, "%"))
}

# 1) Total counts by batch, one panel per time_point (free y so smaller
#    time points remain readable).
dodge_total <- position_dodge(width = 0.8)
p_total_counts <- ggplot(agg_qc, aes(x = batch, y = Total_Counts, fill = DSB)) +
  geom_col(position = dodge_total, width = 0.75) +
  geom_text(
    aes(label = comma_label(Total_Counts)),
    position = dodge_total,
    vjust = -0.25,
    size = 2.7
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  labs(
    title = "Total counts by batch (faceted by time_point)",
    x = "Batch",
    y = "Total counts",
    fill = "DSB"
  ) +
  theme_bw()

# Skip printing entirely when there is nothing to plot.
if (nrow(agg_qc) > 0) print(p_total_counts)

# 2) CIS-only counts by category within each batch (4-combo definition)
# Long format: one row per (batch, time_point, metric) for dodged bars.
cis_counts_long <- agg_qc_combo4 %>%
  select(batch, time_point, Cis_Counts, A_to_B, C_to_D) %>%
  pivot_longer(
    cols = c(Cis_Counts, A_to_B, C_to_D),
    names_to = "Metric",
    values_to = "Counts"
  ) %>%
  mutate(
    Metric = factor(
      Metric,
      levels = c("Cis_Counts", "A_to_B", "C_to_D"),
      labels = c("CIS (total)", "A to B (cis)", "C to D (cis)")
    )
  )

dodge_cis <- position_dodge(width = 0.85)
p_cis_counts <- ggplot(cis_counts_long, aes(x = batch, y = Counts, fill = Metric)) +
  geom_col(position = dodge_cis, width = 0.8) +
  geom_text(
    aes(label = comma_label(Counts)),
    position = dodge_cis,
    vjust = -0.25,
    size = 2.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  labs(
    title = "CIS counts by category per batch",
    x = "Batch",
    y = "Counts",
    fill = ""
  ) +
  theme_bw()

if (nrow(cis_counts_long) > 0) print(p_cis_counts)

# 3) TRANS-only counts by category within each batch (4-combo definition)
# Long format: one row per (batch, time_point, metric) for dodged bars;
# mirrors the CIS plot above with the TRANS-side combos.
trans_counts_long <- agg_qc_combo4 %>%
  select(batch, time_point, Trans_Counts, A_to_D, C_to_B) %>%
  pivot_longer(
    cols = -c(batch, time_point),
    names_to = "Metric",
    values_to = "Counts"
  ) %>%
  mutate(
    # Factor with explicit levels so legend/bar order is total, then parts.
    Metric = factor(
      Metric,
      levels = c("Trans_Counts", "A_to_D", "C_to_B"),
      labels = c("TRANS (total)", "A to D (trans)", "C to B (trans)")
    )
  )

p_trans_counts <- ggplot(trans_counts_long, aes(x = batch, y = Counts, fill = Metric)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = comma_label(Counts)),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 2.4
  ) +
  facet_wrap(~ time_point, scales = "free_y") +
  theme_bw() +
  labs(
    title = "TRANS counts by category per batch",
    x = "Batch",
    y = "Counts",
    fill = ""
  )

# Only print when there is data to show.
if (nrow(trans_counts_long) > 0) {
  print(p_trans_counts)
}

# 4) CIS vs TRANS percent for each batch (single plot; bars side-by-side)
# Percent_Cis/Percent_Trans are computed relative to TOTAL_4Combos.
pct_cis_trans_long <- agg_qc_combo4 %>%
  select(batch, time_point, Percent_Cis, Percent_Trans) %>%
  pivot_longer(
    cols = c(Percent_Cis, Percent_Trans),
    names_to = "Class",
    values_to = "Percent"
  ) %>%
  mutate(
    # Only two source columns exist, so a binary if_else suffices.
    Class = if_else(Class == "Percent_Cis", "CIS", "TRANS"),
    Label = pct_label(Percent)
  )

dodge_pct <- position_dodge(width = 0.85)
p_pct_cis_trans <- ggplot(pct_cis_trans_long, aes(x = batch, y = Percent, fill = Class)) +
  geom_col(position = dodge_pct, width = 0.8) +
  geom_text(
    aes(label = Label),
    position = dodge_pct,
    vjust = -0.25,
    size = 3.2
  ) +
  facet_wrap(~ time_point) +
  # Headroom above 100 so labels on full-height bars are not clipped.
  scale_y_continuous(limits = c(0, 110)) +
  labs(
    title = "QC: CIS vs TRANS percent per batch (within group)",
    x = "Batch",
    y = "% of total counts",
    fill = ""
  ) +
  theme_bw()

has_pct_values <- any(!is.na(pct_cis_trans_long$Percent))
if (nrow(pct_cis_trans_long) > 0 && has_pct_values) print(p_pct_cis_trans)

# 5b) Percent TRANS per batch at T0 vs T120 (single plot; bars side-by-side)
# Note: batches with only other time points (e.g. 180) are dropped by the filter.
pct_trans_time_compare <- agg_qc_combo4 %>%
  filter(time_point %in% c(0, 120)) %>%
  transmute(
    batch,
    # Factor with fixed level order so T0 always precedes T120 in the legend.
    time_point = factor(time_point, levels = c(0, 120), labels = c("T0", "T120")),
    Percent_Trans,
    Label = pct_label(Percent_Trans)
  )

p_pct_trans_t0_t120 <- ggplot(pct_trans_time_compare, aes(x = batch, y = Percent_Trans, fill = time_point)) +
  geom_col(position = position_dodge(width = 0.85), width = 0.8) +
  geom_text(
    aes(label = Label),
    position = position_dodge(width = 0.85),
    vjust = -0.25,
    size = 3.2
  ) +
  # Headroom above 100 so labels on full-height bars are not clipped.
  scale_y_continuous(limits = c(0, 110)) +
  theme_bw() +
  labs(
    title = "QC: TRANS% per batch (T0 vs T120)",
    x = "Batch",
    y = "TRANS% of total",
    fill = "Time"
  )

# Print only when at least one non-NA percentage exists.
if (nrow(pct_trans_time_compare) > 0 && any(!is.na(pct_trans_time_compare$Percent_Trans))) {
  print(p_pct_trans_t0_t120)
}

# 6) Within-class combo composition (stacked; should sum to 100% within class)
# Use ONLY the two combos per class (A_to_B/C_to_D within CIS; A_to_D/C_to_B within TRANS)
agg_combo_within_2only <- agg_qc_combo4

# Long format: one row per (batch, time_point, combo) so the two combos stack.
cis_combo_long <- agg_combo_within_2only %>%
  select(batch, time_point, Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis) %>%
  pivot_longer(
    cols = c(Percent_A_to_B_in_Cis, Percent_C_to_D_in_Cis),
    names_to = "Combo",
    values_to = "Percent"
  ) %>%
  mutate(
    Combo = recode(
      Combo,
      Percent_A_to_B_in_Cis = "A_to_B (within CIS)",
      Percent_C_to_D_in_Cis = "C_to_D (within CIS)"
    )
  )

p_cis_combo_comp <- ggplot(cis_combo_long, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    # Suppress labels for missing or zero-height segments.
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = position_stack(vjust = 0.5),
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  # NOTE(review): limits = c(0, 100) removes out-of-bounds data; if the two
  # stacked percents ever sum to fractionally over 100 from floating point,
  # the bar would vanish — confirm, or consider coord_cartesian(ylim = ...).
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "QC: Within-CIS composition (A_to_B + C_to_D = 100%)",
    x = "Batch",
    y = "% of CIS",
    fill = ""
  )

# Print only when at least one non-NA percentage exists.
if (nrow(cis_combo_long) > 0 && any(!is.na(cis_combo_long$Percent))) {
  print(p_cis_combo_comp)
}

# Mirror of the within-CIS composition plot for the TRANS-side combos.
trans_combo_long <- agg_combo_within_2only %>%
  select(batch, time_point, Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans) %>%
  pivot_longer(
    cols = c(Percent_A_to_D_in_Trans, Percent_C_to_B_in_Trans),
    names_to = "Combo",
    values_to = "Percent"
  ) %>%
  mutate(
    # Only two source columns exist, so a binary if_else suffices.
    Combo = if_else(
      Combo == "Percent_A_to_D_in_Trans",
      "A_to_D (within TRANS)",
      "C_to_B (within TRANS)"
    )
  )

stack_mid <- position_stack(vjust = 0.5)
p_trans_combo_comp <- ggplot(trans_combo_long, aes(x = batch, y = Percent, fill = Combo)) +
  geom_col(width = 0.75) +
  geom_text(
    aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
    position = stack_mid,
    size = 2.4
  ) +
  facet_wrap(~ time_point) +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = "QC: Within-TRANS composition (A_to_D + C_to_B = 100%)",
    x = "Batch",
    y = "% of TRANS",
    fill = ""
  ) +
  theme_bw()

if (nrow(trans_combo_long) > 0 && any(!is.na(trans_combo_long$Percent))) {
  print(p_trans_combo_comp)
}

3. cis/trans normalization — compute totals and percentages per allele

# Compute per-allele CIS/TRANS counts within each batch × time_point × DSB
# group, plus each allele's percentage of its group's CIS (resp. TRANS) total.
# Percentages are NA when the group total for that class is zero.
my_summarize_cistrans <- function(dat){
  per_allele <- dat %>%
    group_by(batch, time_point, DSB, allele) %>%
    summarise(
      Cis_Location_Counts   = sum(count[cis_trans == "CIS"]),
      Trans_Location_Counts = sum(count[cis_trans == "TRANS"]),
      .groups = "drop"
    )

  # Group-level totals serve as denominators for the within-class percentages.
  group_totals <- per_allele %>%
    group_by(batch, time_point, DSB) %>%
    summarise(
      Total_Cis_Location_Counts   = sum(Cis_Location_Counts),
      Total_Trans_Location_Counts = sum(Trans_Location_Counts),
      .groups = "drop"
    )

  per_allele %>%
    left_join(group_totals, by = c("batch", "time_point", "DSB")) %>%
    mutate(
      Percent_Location_in_Cis = if_else(
        Total_Cis_Location_Counts > 0,
        100 * Cis_Location_Counts / Total_Cis_Location_Counts,
        NA_real_
      ),
      Percent_Location_in_Trans = if_else(
        Total_Trans_Location_Counts > 0,
        100 * Trans_Location_Counts / Total_Trans_Location_Counts,
        NA_real_
      )
    )
}

# Allele-level normalization of the master table.
dat_norm <- my_summarize_cistrans(dat_raw)
knitr::kable(head(dat_norm, 10), caption = "Preview: allele-level CIS/TRANS counts and within-class percentages (first 10)")
Preview: allele-level CIS/TRANS counts and within-class percentages (first 10)
batch time_point DSB allele Cis_Location_Counts Trans_Location_Counts Total_Cis_Location_Counts Total_Trans_Location_Counts Percent_Location_in_Cis Percent_Location_in_Trans
4 0 DSB1 Chr12_L01 23365 0 499775 0 4.6751038 NA
4 0 DSB1 Chr12_L05 15119 0 499775 0 3.0251613 NA
4 0 DSB1 Chr12_L07 19481 0 499775 0 3.8979541 NA
4 0 DSB1 Chr12_L12 22779 0 499775 0 4.5578510 NA
4 0 DSB1 Chr15_L01 27868 0 499775 0 5.5761092 NA
4 0 DSB1 Chr15_L04 54 0 499775 0 0.0108049 NA
4 0 DSB1 Chr15_L15 14456 0 499775 0 2.8925016 NA
4 0 DSB1 Chr15_L17_2 28338 0 499775 0 5.6701516 NA
4 0 DSB1 Chr15_L18 4689 0 499775 0 0.9382222 NA
4 0 DSB1 Chr3_L02 26669 0 499775 0 5.3362013 NA

3.5 Uncuttable-normalized allele combo profiles

For this experiment, we normalize counts by the uncuttable/control signal within each (batch × time_point × DSB).

This section produces:

  • A table of per-allele, per-combo counts normalized to uncuttable.
  • A plot of % of uncuttable by allele, colored by combo, for each batch and time.
# Define how to identify NO_CUT / uncuttable reads.
# IMPORTANT: in this experiment we also have rows like "1PERCENTCONTROL".
# We therefore:
#   1) match NO_CUT labels fairly broadly (to catch variants like DSB2_NO_CUT_SSA), and
#   2) explicitly EXCLUDE any 1% spike-in control rows from the NO_CUT denominator.
# The alternation covers "NO_CUT" / "NO CUT" (via NO[_\W]*CUT), the fused
# "NOCUT", and "UNCUT" (which also matches "UNCUTTABLE" as a substring).
uncuttable_pattern <- "NO[_\\W]*CUT|NOCUT|UNCUT"
# Case-insensitive compiled matcher used for all NO_CUT detection below.
uncuttable_regex <- stringr::regex(uncuttable_pattern, ignore_case = TRUE)

# The 1% spike-in appears in labels like "CHRXV_L18_1PERCENT_CONTROL".
percent_control_regex <- stringr::regex("1PERCENT|PERCENTCONTROL|PERCENT_CONTROL", ignore_case = TRUE)

# Determine which column actually contains NO_CUT labels.
# Different runs/pipelines sometimes store these labels in different fields.
# Include both character and factor columns.
char_like_cols <- names(dat_raw)[vapply(dat_raw, function(x) is.character(x) || is.factor(x), logical(1))]
if (length(char_like_cols) == 0) {
  stop("No character/factor columns found in dat_raw; cannot locate NO_CUT labels.")
}

nocut_field_scores <- tibble(
  field = char_like_cols,
  n_matched = vapply(
    char_like_cols,
    function(col) {
      v <- as.character(dat_raw[[col]])
      sum(stringr::str_detect(v, uncuttable_regex) & !stringr::str_detect(v, percent_control_regex), na.rm = TRUE)
    },
    numeric(1)
  )
) %>%
  arrange(desc(n_matched))

nocut_field <- nocut_field_scores$field[[1]]

nocut_field_scores %>%
  head(10) %>%
  knitr::kable(caption = "Diagnostic: candidate fields for NO_CUT labels (counts of matching rows; top 10).")
Diagnostic: candidate fields for NO_CUT labels (counts of matching rows; top 10).
field n_matched
alignment_name 78
allele 78
batch 0
DSB2_loci 0
cis_trans 0
DSB 0
combo 0
repeat 0
source_file 0
# Warn loudly when the matcher found nothing anywhere: every downstream
# NO_CUT denominator would then be zero.
top_match_count <- nocut_field_scores$n_matched[[1]]
if (is.na(top_match_count) || top_match_count == 0) {
  warning(
    "No NO_CUT rows detected in any character field using pattern '",
    uncuttable_pattern,
    "'. Normalized values will be NA/0. Check how NO_CUT is labeled in your input CSVs."
  )
}

# Focus on the 4 key combos used throughout the report.
combos_4 <- c("A_to_B", "C_to_D", "A_to_D", "C_to_B")

# Aggregate to allele × combo within each batch/time/DSB
combo_rows <- filter(dat_raw, combo %in% combos_4)
dat_allele_combo_counts <- combo_rows %>%
  group_by(batch, time_point, replicate, DSB, `repeat`, allele, combo) %>%
  summarise(Counts = sum(count, na.rm = TRUE), .groups = "drop") %>%
  mutate(combo = factor(combo, levels = combos_4))

# Compute DSB-specific NO_CUT denominator per batch/time/DSB.
# If NO_CUT exists in multiple categories/classes for a given DSB, this sums them,
# yielding the DSB-specific NO_CUT total within each (batch × time_point × DSB).
#
# FIX: coerce the label field with as.character() before str_detect().
# The field-detection step accepts factor columns as candidates (and coerces
# them when scoring), but stringr::str_detect() does not accept factors, so a
# factor-valued `nocut_field` would error here. Character fields are unaffected.
dat_uncuttable_denoms <- dat_raw %>%
  group_by(batch, time_point, replicate, DSB, `repeat`) %>%
  summarise(
    Uncuttable_Counts = sum(
      if_else(
        stringr::str_detect(as.character(.data[[nocut_field]]), uncuttable_regex) &
          !stringr::str_detect(as.character(.data[[nocut_field]]), percent_control_regex),
        count,
        0
      ),
      na.rm = TRUE
    ),
    .groups = "drop"
  )

# Diagnostic: how many groups have a usable NO_CUT denominator?
nocut_denoms <- dat_uncuttable_denoms$Uncuttable_Counts
tibble(
  Groups = nrow(dat_uncuttable_denoms),
  Groups_with_NO_CUT = sum(nocut_denoms > 0, na.rm = TRUE),
  Groups_with_NO_CUT_zero = sum(nocut_denoms <= 0, na.rm = TRUE),
  # suppressWarnings: min/max of an all-NA vector warn; the Inf/-Inf results are acceptable here.
  Min_NO_CUT = suppressWarnings(min(nocut_denoms, na.rm = TRUE)),
  Median_NO_CUT = suppressWarnings(median(nocut_denoms, na.rm = TRUE)),
  Max_NO_CUT = suppressWarnings(max(nocut_denoms, na.rm = TRUE))
) %>%
  knitr::kable(caption = "Diagnostic: summary of NO_CUT denominators across batch × time_point × DSB groups")
Diagnostic: summary of NO_CUT denominators across batch × time_point × DSB groups
Groups Groups_with_NO_CUT Groups_with_NO_CUT_zero Min_NO_CUT Median_NO_CUT Max_NO_CUT
42 42 0 20 8441 127246
# If denominators are all zero, show candidate values containing CUT/NOCUT/UNCUT to help identify labeling.
# This is a debugging aid only: it lists the distinct values (with row counts)
# in the detected label field that mention any CUT-like token, so the matcher
# regex can be adjusted to the actual naming scheme.
if (all(dat_uncuttable_denoms$Uncuttable_Counts <= 0, na.rm = TRUE)) {
  dat_raw %>%
    mutate(.nocut_field_value = as.character(.data[[nocut_field]])) %>%
    filter(stringr::str_detect(.nocut_field_value, stringr::regex("CUT|NOCUT|NO[_\\W]*CUT|UNCUT|INTACT", ignore_case = TRUE))) %>%
    count(Value = .nocut_field_value, sort = TRUE, name = "Row_Count") %>%
    head(30) %>%
    knitr::kable(caption = paste0(
      "No positive NO_CUT denominators found. Here are common values containing CUT/NOCUT/UNCUT/INTACT in field '",
      nocut_field,
      "' (first 30). Use this to refine the NO_CUT matcher."
    ))
}

# Diagnostic: list the most common allele labels contributing to NO_CUT denominators.
# FIX: coerce the label field with as.character() before str_detect(); the
# field-detection step accepts factor columns, which stringr::str_detect()
# rejects. Character fields behave exactly as before.
dat_raw %>%
  mutate(.nocut_label = as.character(.data[[nocut_field]])) %>%
  filter(stringr::str_detect(.nocut_label, uncuttable_regex), !stringr::str_detect(.nocut_label, percent_control_regex)) %>%
  count(Value = .nocut_label, wt = count, sort = TRUE, name = "Total_Counts") %>%
  head(20) %>%
  knitr::kable(caption = paste0(
    "Top allele labels matched as NO_CUT (weighted by count; first 20). ",
    "Matching field used: ", nocut_field, ". ",
    "If this is empty, the NO_CUT matcher is not finding your labels."
  ))
Top allele labels matched as NO_CUT (weighted by count; first 20). Matching field used: alignment_name. If this is empty, the NO_CUT matcher is not finding your labels.
Value Total_Counts
CIS_DSB1_NO_REPEAT_NO_CUT 241530
CIS_A_to_B_DSB1_UNCUTTABLE 195175
CIS_C_to_D_DSB2_UNCUTTABLE 121969
CIS_DSB2_NO_REPEAT_NO_CUT 111129
CIS_DSB1_FULL_NO_CUT 71311
CIS_DSB2_FULL_NO_CUT 41151
TRANS_A_to_D_UNCUTTABLE 5025
TRANS_DSB1A_DSB2D_NO_REPEAT_DSB1_NO_CUT 2065
TRANS_DSB1A_DSB2D_FULL_DSB1_NO_CUT 1130
TRANS_DSB2C_DSB1B_NO_REPEAT_DSB2_NO_CUT 338
TRANS_DSB2C_DSB1B_FULL_DSB2_NO_CUT 251
CIS_DSB2_FULL_DSB1_NO_CUT 248
TRANS_C_to_B_UNCUTTABLE 209
CIS_DSB2_NO_REPEAT_DSB1_NO_CUT 116
CIS_DSB1_FULL_DSB2_NO_CUT 66
CIS_DSB1_NO_REPEAT_DSB2_NO_CUT 46
TRANS_DSB1A_DSB2D_NO_REPEAT_DSB2_NO_CUT 9
TRANS_DSB2C_DSB1B_NO_REPEAT_DSB1_NO_CUT 3
TRANS_DSB2C_DSB1B_FULL_DSB1_NO_CUT 2
TRANS_DSB1A_DSB2D_FULL_DSB2_NO_CUT 1
# Diagnostic: show the actual NO_CUT denominators being used.
# This confirms, for example, that Batch 4 DSB1 is normalized by Batch 4 DSB1 NO_CUT counts (per time_point).
dat_uncuttable_denoms %>%
  arrange(batch, time_point, replicate, DSB, `repeat`) %>%
  slice_head(n = 20) %>%
  knitr::kable(caption = "Uncuttable denominators (NO_CUT): Uncuttable_Counts per batch × time_point × replicate × DSB × repeat-class (first 20 rows)")
Uncuttable denominators (NO_CUT): Uncuttable_Counts per batch × time_point × replicate × DSB × repeat-class (first 20 rows)
batch time_point replicate DSB repeat Uncuttable_Counts
4 0 1 DSB1 ALL 28178
4 0 1 DSB2 ALL 8847
4 0 1 TRANS ALL 115
4 120 1 DSB1 ALL 103838
4 120 1 DSB2 ALL 23505
4 120 1 TRANS ALL 2106
6 0 1 DSB1 ALL 13004
6 0 1 DSB1 INTACT 23997
6 0 1 DSB1 SSA 15119
6 0 1 DSB2 ALL 2571
6 0 1 DSB2 INTACT 21258
6 0 1 DSB2 SSA 10877
6 0 1 TRANS ALL 482
6 0 1 TRANS INTACT 524
6 0 1 TRANS SSA 140
6 120 1 DSB1 ALL 20548
6 120 1 DSB2 ALL 2325
6 120 1 TRANS ALL 731
6 180 1 DSB1 INTACT 22097
6 180 1 DSB1 SSA 91176
# Flag groups whose NO_CUT denominator is missing or zero before normalizing.
has_zero_denoms <- any(dat_uncuttable_denoms$Uncuttable_Counts <= 0, na.rm = TRUE)
if (has_zero_denoms) {
  message("Note: Some batch × time_point × DSB groups have Uncuttable_Counts = 0; Percent_of_Uncuttable will be NA for those groups.")
}

# Normalize each allele×combo count by the uncuttable denominator for that DSB group
denom_keys <- c("batch", "time_point", "replicate", "DSB", "repeat")
dat_uncut_norm <- dat_allele_combo_counts %>%
  left_join(dat_uncuttable_denoms, by = denom_keys) %>%
  mutate(
    Percent_of_Uncuttable = if_else(
      Uncuttable_Counts > 0,
      100 * Counts / Uncuttable_Counts,
      NA_real_
    )
  )

# Diagnostic: it is possible (and sometimes expected) for % of NO_CUT to exceed 100,
# because this metric is a ratio to the NO_CUT signal, not a within-group probability.
# Still, values >>100 can also indicate a very small NO_CUT denominator or a labeling mismatch.
over_100 <- dat_uncut_norm %>%
  filter(!is.na(Percent_of_Uncuttable) & Percent_of_Uncuttable > 100) %>%
  arrange(desc(Percent_of_Uncuttable))

if (nrow(over_100) > 0) {
  over_100 %>%
    select(batch, time_point, DSB, allele, combo, Counts, Uncuttable_Counts, Percent_of_Uncuttable) %>%
    mutate(Percent_of_Uncuttable = round(Percent_of_Uncuttable, 2)) %>%
    head(25) %>%
    knitr::kable(caption = "Rows where % of NO_CUT > 100 (top 25). Large values usually mean NO_CUT counts are low for that batch/time/DSB.")
} else {
  message("No allele×combo rows exceed 100% of NO_CUT.")
}
Rows where % of NO_CUT > 100 (top 25). Large values usually mean NO_CUT counts are low for that batch/time/DSB.
batch time_point DSB allele combo Counts Uncuttable_Counts Percent_of_Uncuttable
8 0 DSB2 COMMON C_to_D 1044235 3550 29415.07
8 180 DSB2 COMMON C_to_D 527722 5283 9989.06
6 0 DSB2 COMMON C_to_D 1715641 21258 8070.57
6 180 TRANS COMMON C_to_B 52528 781 6725.74
6 0 DSB2 Chr4 C_to_D 164802 2571 6410.04
4 0 DSB2 Chr7 C_to_D 449843 8847 5084.70
6 180 DSB2 COMMON C_to_D 563623 11308 4984.29
8 180 TRANS COMMON C_to_B 70029 1474 4750.95
8 0 DSB2 Chr15 C_to_D 213677 4936 4328.95
6 180 TRANS COMMON C_to_B 14469 366 3953.28
8 180 DSB2 COMMON C_to_D 1155167 34704 3328.63
6 120 DSB2 Chr4 C_to_D 68728 2325 2956.04
8 180 TRANS CHRXV_L01 A_to_D 34978 1474 2373.00
8 180 TRANS COMMON C_to_B 9081 421 2157.01
6 180 TRANS CHRIV_L10_2 A_to_D 15870 781 2032.01
4 120 DSB2 Chr7 C_to_D 460116 23505 1957.52
6 180 TRANS CHRIII_L04 A_to_D 14606 781 1870.17
6 180 DSB2 COMMON C_to_D 1117160 64040 1744.47
6 180 TRANS CHRXII_L05 A_to_D 13382 781 1713.44
6 180 TRANS CHRXV_L01 A_to_D 12888 781 1650.19
8 180 TRANS CHRXII_L12 A_to_D 19413 1474 1317.03
8 180 TRANS CHRXV_L01 A_to_D 5302 421 1259.38
6 180 TRANS CHRXII_L07 A_to_D 9125 781 1168.37
6 180 TRANS CHRIII_L03 A_to_D 7997 781 1023.94
6 0 DSB1 CHRIII_L04 A_to_B 228463 23997 952.05
# ---- Group-level CIS/TRANS after NO_CUT normalization ----
# Note: NO_CUT normalization rescales counts within each (batch × time_point × DSB).
# That means CIS and TRANS *fractions of total* are unchanged, but CIS/TRANS as
# a percent of NO_CUT are newly normalized and can be compared across groups.
cis_combos <- c("A_to_B", "C_to_D")
trans_combos <- c("A_to_D", "C_to_B")

# NOTE(review): `combos_4` is used here but only visibly defined in a later
# chunk — presumably an earlier chunk defines it too; confirm.
dat_group_combo_counts <- dat_raw |>
  filter(combo %in% combos_4) |>
  group_by(batch, time_point, replicate, DSB, `repeat`, combo) |>
  summarise(Counts = sum(count, na.rm = TRUE), .groups = "drop") |>
  mutate(combo = factor(combo, levels = combos_4))

# Collapse the four combos into CIS vs TRANS totals, then express each as a
# percent of the group's NO_CUT denominator (NA when the denominator is 0).
dat_group_cistrans_norm <- dat_group_combo_counts |>
  group_by(batch, time_point, replicate, DSB, `repeat`) |>
  summarise(
    Cis_Counts_4 = sum(Counts[combo %in% cis_combos], na.rm = TRUE),
    Trans_Counts_4 = sum(Counts[combo %in% trans_combos], na.rm = TRUE),
    .groups = "drop"
  ) |>
  left_join(dat_uncuttable_denoms, by = c("batch", "time_point", "replicate", "DSB", "repeat")) |>
  mutate(
    Percent_Cis_of_NO_CUT = if_else(Uncuttable_Counts > 0, 100 * Cis_Counts_4 / Uncuttable_Counts, NA_real_),
    Percent_Trans_of_NO_CUT = if_else(Uncuttable_Counts > 0, 100 * Trans_Counts_4 / Uncuttable_Counts, NA_real_)
  )

dat_group_cistrans_norm |>
  arrange(batch, time_point, replicate, DSB, `repeat`) |>
  head(20) |>
  knitr::kable(caption = "Preview: CIS and TRANS (4-combo) normalized to NO_CUT (% of NO_CUT; first 20 rows)")
Preview: CIS and TRANS (4-combo) normalized to NO_CUT (% of NO_CUT; first 20 rows)
batch time_point replicate DSB repeat Cis_Counts_4 Trans_Counts_4 Uncuttable_Counts Percent_Cis_of_NO_CUT Percent_Trans_of_NO_CUT
4 0 1 DSB1 ALL 499775 0 28178 1773.6355 0.000
4 0 1 DSB2 ALL 458690 0 8847 5184.6954 0.000
4 0 1 TRANS ALL 0 3754 115 0.0000 3264.348
4 120 1 DSB1 ALL 538320 0 103838 518.4229 0.000
4 120 1 DSB2 ALL 483621 0 23505 2057.5239 0.000
4 120 1 TRANS ALL 0 31673 2106 0.0000 1503.941
6 0 1 DSB1 ALL 223186 0 13004 1716.2873 0.000
6 0 1 DSB1 INTACT 1193579 0 23997 4973.8676 0.000
6 0 1 DSB1 SSA 44570 0 15119 294.7946 0.000
6 0 1 DSB2 ALL 167373 0 2571 6510.0350 0.000
6 0 1 DSB2 INTACT 1741835 0 21258 8193.7859 0.000
6 0 1 DSB2 SSA 35803 0 10877 329.1625 0.000
6 0 1 TRANS ALL 0 11169 482 0.0000 2317.220
6 0 1 TRANS INTACT 0 6013 524 0.0000 1147.519
6 0 1 TRANS SSA 0 1434 140 0.0000 1024.286
6 120 1 DSB1 ALL 169434 0 20548 824.5766 0.000
6 120 1 DSB2 ALL 71053 0 2325 3056.0430 0.000
6 120 1 TRANS ALL 0 12644 731 0.0000 1729.685
6 180 1 DSB1 INTACT 504314 0 22097 2282.2736 0.000
6 180 1 DSB1 SSA 1045209 0 91176 1146.3642 0.000
# Preview the allele×combo normalization table.
dat_uncut_norm %>%
  arrange(batch, time_point, replicate, DSB, `repeat`, allele, combo) %>%
  head(12) %>%
  knitr::kable(caption = "Preview: allele×combo counts normalized to uncuttable (% of uncuttable; first 12 rows)")
Preview: allele×combo counts normalized to uncuttable (% of uncuttable; first 12 rows)
batch time_point replicate DSB repeat allele combo Counts Uncuttable_Counts Percent_of_Uncuttable
4 0 1 DSB1 ALL Chr12_L01 A_to_B 23365 28178 82.9192987
4 0 1 DSB1 ALL Chr12_L05 A_to_B 15119 28178 53.6553339
4 0 1 DSB1 ALL Chr12_L07 A_to_B 19481 28178 69.1354958
4 0 1 DSB1 ALL Chr12_L12 A_to_B 22779 28178 80.8396621
4 0 1 DSB1 ALL Chr15_L01 A_to_B 27868 28178 98.8998509
4 0 1 DSB1 ALL Chr15_L04 A_to_B 54 28178 0.1916389
4 0 1 DSB1 ALL Chr15_L15 A_to_B 14456 28178 51.3024345
4 0 1 DSB1 ALL Chr15_L17_2 A_to_B 28338 28178 100.5678189
4 0 1 DSB1 ALL Chr15_L18 A_to_B 4689 28178 16.6406416
4 0 1 DSB1 ALL Chr3_L02 A_to_B 26669 28178 94.6447583
4 0 1 DSB1 ALL Chr3_L03 A_to_B 19 28178 0.0674285
4 0 1 DSB1 ALL Chr3_L04 A_to_B 38319 28178 135.9890695
# Plot: percent-of-uncuttable by allele, per batch.
# Requested: make DSB1 and DSB2 separate plots (and also include TRANS), and add data labels.
plot_df <- dat_uncut_norm |>
  filter(!is.na(Percent_of_Uncuttable)) |>
  mutate(
    DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")),
    # NAs were filtered above, so every remaining row gets an "xx.x%" label.
    Label = if_else(
      is.na(Percent_of_Uncuttable),
      NA_character_,
      paste0(round(Percent_of_Uncuttable, 1), "%")
    )
  )

if (nrow(plot_df) == 0) {
  message("No uncuttable-normalized values to plot (Percent_of_Uncuttable is all NA).")
} else {
  # One figure per batch × DSB class, faceted by repeat-class and time point.
  for (b in sort(unique(plot_df$batch))) {
    df_b <- filter(plot_df, batch == b)
    if (nrow(df_b) == 0) next

    for (dsb_name in c("DSB1", "DSB2", "TRANS")) {
      df_bd <- filter(df_b, DSB == dsb_name)
      if (nrow(df_bd) == 0) next

      print(
        ggplot(df_bd, aes(x = allele, y = Percent_of_Uncuttable, fill = combo)) +
          geom_col(position = position_dodge(width = 0.9), width = 0.85) +
          geom_text(
            aes(label = Label),
            position = position_dodge(width = 0.9),
            vjust = -0.25,
            size = 2.6,
            angle = 90
          ) +
          facet_grid(`repeat` ~ time_point, scales = "free_x") +
          theme_bw() +
          theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
          labs(
            title = paste0("Uncuttable-normalized allele×combo (% of NO_CUT) | Batch: ", b, " | ", dsb_name),
            x = "Allele",
            y = paste0("% of NO_CUT (within batch × time_point × ", dsb_name, ")"),
            fill = "Combo"
          )
      )
    }
  }
}

# Plot: CIS% of NO_CUT and TRANS% of NO_CUT (group-level), separately for DSB1, DSB2, and TRANS
plot_ct <- dat_group_cistrans_norm |>
  filter(!is.na(Percent_Cis_of_NO_CUT) | !is.na(Percent_Trans_of_NO_CUT)) |>
  # Long format: one row per (group, CIS/TRANS) so both classes dodge side by side.
  pivot_longer(
    cols = c(Percent_Cis_of_NO_CUT, Percent_Trans_of_NO_CUT),
    names_to = "Class",
    values_to = "Percent"
  ) |>
  mutate(
    Class = recode(Class,
                   Percent_Cis_of_NO_CUT = "CIS",
                   Percent_Trans_of_NO_CUT = "TRANS"),
    Label = if_else(is.na(Percent), NA_character_, paste0(round(Percent, 1), "%")),
    DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))
  )

if (nrow(plot_ct) == 0) {
  message("No group-level CIS/TRANS NO_CUT-normalized values to plot.")
} else {
  # One figure per batch × DSB class, x-axis = time point.
  for (b in sort(unique(plot_ct$batch))) {
    df_b <- filter(plot_ct, batch == b)
    if (nrow(df_b) == 0) next

    for (dsb_name in c("DSB1", "DSB2", "TRANS")) {
      df_bd <- filter(df_b, DSB == dsb_name)
      if (nrow(df_bd) == 0) next

      print(
        ggplot(df_bd, aes(x = factor(time_point), y = Percent, fill = Class)) +
          geom_col(position = position_dodge(width = 0.9), width = 0.8) +
          geom_text(
            aes(label = Label),
            position = position_dodge(width = 0.9),
            vjust = -0.25,
            size = 3.0
          ) +
          theme_bw() +
          labs(
            title = paste0("CIS% and TRANS% of NO_CUT (4-combo totals) | Batch: ", b, " | ", dsb_name),
            x = "Time point",
            y = "% of NO_CUT",
            fill = ""
          )
      )
    }
  }
}


3.6 Two-step normalization: NO_CUT baseline + 1% spike-in anchor

This section implements the two-step scheme discussed:

  1. Normalize each feature to its DSB-specific NO_CUT total (background/intact baseline)
  2. Scale those NO_CUT-normalized ratios using the 1% spike-in in the same DSB and repeat-class to approximate a per-cell-like quantity.

For a feature count \(C\) in a given (batch × time × replicate × DSB × repeat-class), and the DSB-specific NO_CUT total \(NC\):

\[R_{\text{NOCUT}} = \frac{C}{NC}\]

In these PD summary CSVs, the 1% control appears as an internal spike-in allele (e.g. alleles containing 1PERCENT_CONTROL). Let \(S\) be the total spike-in counts in the same (batch × time_point × replicate × DSB × repeat-class) group — the same grouping used for \(NC\). A useful per-cell-like scaling is:

\[R_{\text{final}} = \frac{C}{S} \quad\text{and}\quad R_{\text{final,per cell}} = 0.01\cdot\frac{C}{S}\]

This corresponds to: normalize to NO_CUT, then rescale by the spike-in in that DSB/class (algebraically, the NO_CUT denominator cancels when both are taken in the same group).

# Two-step scheme for the PD CSVs:
# - NO_CUT is represented as allele == "NO_CUT" (split by repeat-class and DSB)
# - The 1% spike-in is represented as alleles like "*_1PERCENT_CONTROL" (also split by repeat-class and DSB)
#
# For each (batch × time_point × replicate × DSB × repeat-class):
#   Step 1: R_NOCUT = C_feat / C_NOCUT_total
#   Step 2: scale using spike-in: PerCell ~= 0.01 * C_feat / C_spike
#          (equivalently: 0.01 * R_NOCUT / (C_spike / C_NOCUT_total))

# Define the matching regexes only if an earlier chunk has not already done so.
if (!exists("uncuttable_regex")) {
  uncuttable_pattern <- "NO[_\\W]*CUT|NOCUT|UNCUT"
  uncuttable_regex <- stringr::regex(uncuttable_pattern, ignore_case = TRUE)
}
if (!exists("percent_control_regex")) {
  percent_control_regex <- stringr::regex(
    "1PERCENT|PERCENTCONTROL|PERCENT_CONTROL",
    ignore_case = TRUE
  )
}

# The four cut-site junction combos used throughout (CIS: A_to_B/C_to_D; TRANS: A_to_D/C_to_B).
combos_4 <- c("A_to_B", "C_to_D", "A_to_D", "C_to_B")

# Tag every row of dat_raw as spike-in and/or NO_CUT based on allele /
# alignment_name text. Columns defined earlier in the same mutate() are
# visible to later ones, so this is a single pipeline.
df_lab <- dat_raw %>%
  mutate(
    .allele_chr = as.character(allele),
    .aln_chr = as.character(alignment_name),
    is_spike = stringr::str_detect(.allele_chr, percent_control_regex) |
      stringr::str_detect(.aln_chr, percent_control_regex),
    is_nocut_raw = stringr::str_detect(.allele_chr, uncuttable_regex) |
      stringr::str_detect(.aln_chr, stringr::regex("NO[_\\W]*CUT|NOCUT", ignore_case = TRUE)),
    # Heuristic to avoid counting cross-DSB NO_CUT rows (e.g. "DSB2_NO_CUT" entries under DSB1)
    is_nocut = case_when(
      DSB == "DSB1" ~ is_nocut_raw & !stringr::str_detect(.aln_chr, stringr::regex("DSB2[_\\W]*NO[_\\W]*CUT", ignore_case = TRUE)),
      DSB == "DSB2" ~ is_nocut_raw & !stringr::str_detect(.aln_chr, stringr::regex("DSB1[_\\W]*NO[_\\W]*CUT", ignore_case = TRUE)),
      TRUE ~ is_nocut_raw
    )
  )

# How many rows were tagged as spike-in / NO_CUT overall.
df_lab |>
  summarise(
    Rows = n(),
    Rows_spike = sum(is_spike, na.rm = TRUE),
    Rows_nocut = sum(is_nocut, na.rm = TRUE)
  ) |>
  knitr::kable(caption = "Diagnostic: detected NO_CUT and 1% spike-in rows in dat_raw")
Diagnostic: detected NO_CUT and 1% spike-in rows in dat_raw
Rows Rows_spike Rows_nocut
1107 23 63
# Per-group denominator totals: NO_CUT reads (spike-in rows excluded so the
# two denominators never overlap) and spike-in reads.
group_denoms <- df_lab |>
  group_by(batch, time_point, replicate, DSB, `repeat`) |>
  summarise(
    C_NOCUT_total = sum(count[is_nocut & !is_spike], na.rm = TRUE),
    C_spike_total = sum(count[is_spike], na.rm = TRUE),
    .groups = "drop"
  )

# How many groups actually have usable NO_CUT / spike-in denominators.
group_denoms |>
  summarise(
    Groups = n(),
    Groups_with_NOCUT = sum(C_NOCUT_total > 0, na.rm = TRUE),
    Groups_with_spike = sum(C_spike_total > 0, na.rm = TRUE)
  ) |>
  knitr::kable(caption = "Diagnostic: denominator availability by (batch × time × replicate × DSB × repeat-class)")
Diagnostic: denominator availability by (batch × time × replicate × DSB × repeat-class)
Groups Groups_with_NOCUT Groups_with_spike
42 42 21
# Preview the denominator table itself.
group_denoms |>
  arrange(batch, time_point, replicate, DSB, `repeat`) |>
  head(20) |>
  knitr::kable(caption = "Preview: NO_CUT totals and spike-in totals (first 20 groups)")
Preview: NO_CUT totals and spike-in totals (first 20 groups)
batch time_point replicate DSB repeat C_NOCUT_total C_spike_total
4 0 1 DSB1 ALL 28178 0
4 0 1 DSB2 ALL 8847 0
4 0 1 TRANS ALL 115 0
4 120 1 DSB1 ALL 103838 0
4 120 1 DSB2 ALL 23505 0
4 120 1 TRANS ALL 2106 0
6 0 1 DSB1 ALL 13004 0
6 0 1 DSB1 INTACT 23961 7649
6 0 1 DSB1 SSA 15117 187
6 0 1 DSB2 ALL 2571 0
6 0 1 DSB2 INTACT 21082 33
6 0 1 DSB2 SSA 10874 0
6 0 1 TRANS ALL 482 0
6 0 1 TRANS INTACT 524 31
6 0 1 TRANS SSA 140 9
6 120 1 DSB1 ALL 20548 0
6 120 1 DSB2 ALL 2325 0
6 120 1 TRANS ALL 731 0
6 180 1 DSB1 INTACT 22085 3927
6 180 1 DSB1 SSA 91150 11342
# Feature counts (allele × combo)
feat_counts <- df_lab |>
  filter(combo %in% combos_4) |>
  group_by(batch, time_point, replicate, DSB, `repeat`, allele, combo) |>
  summarise(C_feat = sum(count, na.rm = TRUE), .groups = "drop")

# Attach the group denominators and compute both normalization steps.
# Ratios are NA (not Inf) when the corresponding denominator is 0 or missing.
feat_norm <- feat_counts |>
  left_join(group_denoms, by = c("batch", "time_point", "replicate", "DSB", "repeat")) |>
  mutate(
    # Step 1: ratio to the NO_CUT baseline.
    R_NOCUT = if_else(C_NOCUT_total > 0, C_feat / C_NOCUT_total, NA_real_),
    Spike_per_NOCUT = if_else(C_NOCUT_total > 0, C_spike_total / C_NOCUT_total, NA_real_),
    # Step 2: ratio to the 1% spike-in, then scale to a per-cell-like quantity.
    R_final = if_else(C_spike_total > 0, C_feat / C_spike_total, NA_real_),
    R_final_per_cell = 0.01 * R_final
  )

# Preview the two-step normalization outputs.
feat_norm |>
  arrange(batch, time_point, replicate, DSB, `repeat`, allele, combo) |>
  head(20) |>
  knitr::kable(caption = "Preview: two-step outputs. R_NOCUT = C_feat / NO_CUT_total; R_final = C_feat / spike_total; R_final_per_cell = 0.01 * R_final (first 20).")
Preview: two-step outputs. R_NOCUT = C_feat / NO_CUT_total; R_final = C_feat / spike_total; R_final_per_cell = 0.01 * R_final (first 20).
batch time_point replicate DSB repeat allele combo C_feat C_NOCUT_total C_spike_total R_NOCUT Spike_per_NOCUT R_final R_final_per_cell
4 0 1 DSB1 ALL Chr12_L01 A_to_B 23365 28178 0 0.8291930 0 NA NA
4 0 1 DSB1 ALL Chr12_L05 A_to_B 15119 28178 0 0.5365533 0 NA NA
4 0 1 DSB1 ALL Chr12_L07 A_to_B 19481 28178 0 0.6913550 0 NA NA
4 0 1 DSB1 ALL Chr12_L12 A_to_B 22779 28178 0 0.8083966 0 NA NA
4 0 1 DSB1 ALL Chr15_L01 A_to_B 27868 28178 0 0.9889985 0 NA NA
4 0 1 DSB1 ALL Chr15_L04 A_to_B 54 28178 0 0.0019164 0 NA NA
4 0 1 DSB1 ALL Chr15_L15 A_to_B 14456 28178 0 0.5130243 0 NA NA
4 0 1 DSB1 ALL Chr15_L17_2 A_to_B 28338 28178 0 1.0056782 0 NA NA
4 0 1 DSB1 ALL Chr15_L18 A_to_B 4689 28178 0 0.1664064 0 NA NA
4 0 1 DSB1 ALL Chr3_L02 A_to_B 26669 28178 0 0.9464476 0 NA NA
4 0 1 DSB1 ALL Chr3_L03 A_to_B 19 28178 0 0.0006743 0 NA NA
4 0 1 DSB1 ALL Chr3_L04 A_to_B 38319 28178 0 1.3598907 0 NA NA
4 0 1 DSB1 ALL Chr4_L01 A_to_B 16192 28178 0 0.5746327 0 NA NA
4 0 1 DSB1 ALL Chr4_L03 A_to_B 19387 28178 0 0.6880190 0 NA NA
4 0 1 DSB1 ALL Chr4_L06_2 A_to_B 18430 28178 0 0.6540564 0 NA NA
4 0 1 DSB1 ALL Chr4_L10_2 A_to_B 45340 28178 0 1.6090567 0 NA NA
4 0 1 DSB1 ALL Chr4_L12_2 A_to_B 22582 28178 0 0.8014054 0 NA NA
4 0 1 DSB1 ALL Chr4_L13 A_to_B 19743 28178 0 0.7006530 0 NA NA
4 0 1 DSB1 ALL Chr4_L15_2 A_to_B 21694 28178 0 0.7698914 0 NA NA
4 0 1 DSB1 ALL Chr7_990K A_to_B 15400 28178 0 0.5465257 0 NA NA
# Plot: per-cell-like abundance scaled by spike-in
plot_final <- feat_norm |>
  filter(!is.na(R_final_per_cell)) |>
  # Exclude the spike-in alleles themselves from the feature plot.
  filter(!stringr::str_detect(as.character(allele), percent_control_regex)) |>
  mutate(
    DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")),
    combo = factor(combo, levels = combos_4),
    Label = if_else(is.na(R_final_per_cell), NA_character_, format(round(R_final_per_cell, 4), nsmall = 4))
  )

if (nrow(plot_final) == 0) {
  message("No R_final_per_cell values to plot. Check that spike-in rows (1PERCENT_CONTROL) and NO_CUT rows are present with nonzero counts.")
} else {
  # One figure per batch × DSB class, faceted by repeat-class and time point.
  for (b in sort(unique(plot_final$batch))) {
    df_b <- filter(plot_final, batch == b)

    for (dsb_name in c("DSB1", "DSB2", "TRANS")) {
      df_bd <- filter(df_b, DSB == dsb_name)
      if (nrow(df_bd) == 0) next

      print(
        ggplot(df_bd, aes(x = allele, y = R_final_per_cell, fill = combo)) +
          geom_col(position = position_dodge(width = 0.9), width = 0.85) +
          geom_text(
            aes(label = Label),
            position = position_dodge(width = 0.9),
            vjust = -0.25,
            size = 2.4,
            angle = 90
          ) +
          facet_grid(`repeat` ~ time_point, scales = "free_x") +
          theme_bw() +
          theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
          labs(
            title = paste0("Two-step scaled per-cell-like abundance (0.01 * C_feat / spike_total) | Batch: ", b, " | ", dsb_name),
            x = "Allele",
            y = "Approx per-cell abundance (relative units)",
            fill = "Combo"
          )
      )
    }
  }
}